package sis.ra.yahooNewsSummary;

import java.io.File;

import sis.ra.utility.Utils;


/**
 * Small command-line helpers for turning a list of CNET article links into
 * local HTML files and for extracting the article text from those files.
 *
 * All file and network access is delegated to {@link Utils}; this class only
 * decides which files to read/write and how to strip markup from the text.
 */
public class getCNet {

	/** Default dataset directory (with trailing separator) used when no directory is supplied. */
	private static final String DATASET_DIR = "C:\\Documents and Settings\\I820753\\Desktop\\dataset\\cnet\\";

	/** Marker that opens the article text section inside a downloaded page. */
	private static final String TEXT_START_MARKER = "<div section=\"txt\">";

	/** Marker of the section that follows the article text. */
	private static final String TEXT_END_MARKER = "<div section=\"tback\">";

	/** Extension of the downloaded pages. */
	private static final String HTML_SUFFIX = ".html";

	/**
	 * Reads <code>linklist.txt</code> from the dataset directory (one URL per
	 * line), fetches every URL and stores the result as
	 * <code>cnet&lt;lineIndex&gt;.html</code> in the same directory.
	 *
	 * Each line is trimmed before use so that a link list saved with Windows
	 * line endings does not hand URLs with a trailing carriage return to the
	 * fetcher; blank lines are skipped. The file index stays the line index,
	 * so skipping a blank line does not renumber the remaining files.
	 */
	public static void extractHTMLfrmlinks ()
	{
		String file = DATASET_DIR+"linklist.txt";
		String content=Utils.readFile(file);
		// NOTE(review): whether Utils.readFile can return null is not visible here;
		// the guard only avoids a NullPointerException if it does.
		if (content==null)
		{
			System.err.println("Could not read link list: "+file);
			return;
		}
		String [] links=content.split("\n");
		for (int i=0;i<links.length;i++)
		{
			String link=links[i].trim();
			if (link.length()==0) continue;
			System.out.println(link);
			String contentpage=Utils.fetchContentfrURL(link);
			String filename=DATASET_DIR+"cnet"+i+HTML_SUFFIX;
			System.out.println(filename);
			// NOTE(review): the meaning of the two boolean flags is defined by Utils.writeToFile.
			Utils.writeToFile(filename, contentpage, true, true);
		}
	}

	/**
	 * Strips markup from an HTML fragment and returns the remaining text.
	 *
	 * Plain paragraph tags (<code>&lt;p&gt;</code>, <code>&lt;/p&gt;</code> and
	 * their upper-case forms) are removed first. The rest is split at every
	 * <code>&lt;</code>; for each piece that contains a <code>&gt;</code>, the
	 * text after the first <code>&gt;</code> is trimmed and appended to the
	 * result, preceded by a single space. Pieces without a <code>&gt;</code>
	 * (including any text in front of the first tag) are discarded. Finally,
	 * runs of blank lines are collapsed.
	 *
	 * @param content HTML fragment; <code>null</code> is treated as empty
	 * @return the extracted text, or an empty string if nothing was extracted
	 */
	public static String removeTag(String content)
	{
		if (content==null) return "";
		String removetag=content.replace("</P>", "").replace("<P>", "").replace("</p>", "").replace("<p>", "");
		String [] snip=removetag.split("<");
		StringBuilder pas=new StringBuilder();
		for (int i=0;i<snip.length;i++)
		{
			int close=snip[i].indexOf('>');
			if (close<0) continue;
			// Keep everything after the tag's closing bracket, so that a literal
			// '>' inside the text does not cut the remainder off.
			String text=snip[i].substring(close+1);
			if (text.length()>0)
			{
				pas.append(' ').append(text.trim());
			}
		}
		return pas.toString().replace("\n\n", "\n").replace("\n \n", "\n").replace("\n\n", "\n");
	}

	/**
	 * Scans a directory for <code>.html</code> files, cuts out the part between
	 * the text-start and text-end markers, strips its markup and prints the
	 * target <code>.txt</code> file name together with the length of the cut-out
	 * HTML. Files that lack either marker are skipped. At the end the number of
	 * processed files is printed.
	 *
	 * Writing the extracted text is currently disabled (see the commented-out
	 * call at the end of the loop), so this run only reports what it would do.
	 *
	 * @param args optional; <code>args[0]</code> is the directory to scan,
	 *             defaulting to the built-in dataset directory
	 */
	public static void main(String[] args) {
		String file = (args!=null && args.length>0) ? args[0] : DATASET_DIR;
		File f=new File (file);
		File [] files=f.listFiles();
		// listFiles() yields null when the path is not a directory or cannot be read.
		if (files==null)
		{
			System.err.println("Not a readable directory: "+file);
			return;
		}
		int count=0;
		for (int i=0;i<files.length;i++)
		{
			String path=files[i].toString();
			if (!path.endsWith(HTML_SUFFIX)) continue;
			String content = Utils.readFile(path);
			if (content==null) continue;
			int start=content.indexOf(TEXT_START_MARKER);
			if (start<0) continue;
			// Search for the end marker after the start marker only; an end marker
			// that appears earlier in the page must not produce an inverted range.
			int end=content.indexOf(TEXT_END_MARKER, start);
			if (end<0) continue;
			content=content.substring(start,end);
			count=count+1;
			String newline=removeTag(content);
			// Swap only the extension, not every ".html" occurrence in the path.
			String newfilename=path.substring(0, path.length()-HTML_SUFFIX.length())+".txt";
			System.out.println(newfilename+"\t"+content.length());
	//		Utils.writeToFile(newfilename, newline, true,true);
		}
		System.out.println(count);
	}

}
